# Importing the libraries required.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
# Setting display attributes for better readability of the dataframe.
import warnings

# NOTE: this silences every warning for the rest of the session, including
# pandas/seaborn deprecation notices raised by later cells.
warnings.filterwarnings('ignore')

# Option keys are written out in full. The short key 'precision' is ambiguous
# on recent pandas (it also matches the Styler precision option) and raises
# an OptionError there.
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.precision', 2)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Read the marketing CSV into the working dataframe used by every later cell.
df = pd.read_csv(filepath_or_buffer='marketing_data.csv')
# Preview five randomly chosen observations.
df.sample(n=5)
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1141 | 8091 | 1956 | Graduation | Married | $63,943.00 | 0 | 1 | 9/2/12 | 50 | 423 | 184 | 368 | 13 | 97 | 21 | 1 | 6 | 4 | 6 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | IND |
| 1460 | 5966 | 1974 | PhD | Married | $45,207.00 | 1 | 1 | 11/10/12 | 64 | 203 | 0 | 10 | 0 | 0 | 2 | 5 | 3 | 1 | 6 | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | SP |
| 1044 | 3933 | 1980 | Graduation | Together | $44,010.00 | 1 | 0 | 10/13/12 | 46 | 186 | 36 | 234 | 86 | 72 | 48 | 6 | 10 | 2 | 6 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | SA |
| 682 | 2631 | 1982 | Graduation | Together | $71,853.00 | 0 | 0 | 5/8/13 | 29 | 358 | 108 | 413 | 141 | 97 | 32 | 1 | 2 | 8 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | SP |
| 482 | 2666 | 1972 | Master | Married | $76,234.00 | 0 | 1 | 2/6/14 | 21 | 519 | 50 | 167 | 130 | 0 | 41 | 1 | 8 | 3 | 11 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | CA |
# Column names, non-null counts and dtypes of the freshly loaded dataframe.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2240 entries, 0 to 2239 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 2240 non-null int64 1 Year_Birth 2240 non-null int64 2 Education 2240 non-null object 3 Marital_Status 2240 non-null object 4 Income 2216 non-null object 5 Kidhome 2240 non-null int64 6 Teenhome 2240 non-null int64 7 Dt_Customer 2240 non-null object 8 Recency 2240 non-null int64 9 MntWines 2240 non-null int64 10 MntFruits 2240 non-null int64 11 MntMeatProducts 2240 non-null int64 12 MntFishProducts 2240 non-null int64 13 MntSweetProducts 2240 non-null int64 14 MntGoldProds 2240 non-null int64 15 NumDealsPurchases 2240 non-null int64 16 NumWebPurchases 2240 non-null int64 17 NumCatalogPurchases 2240 non-null int64 18 NumStorePurchases 2240 non-null int64 19 NumWebVisitsMonth 2240 non-null int64 20 AcceptedCmp3 2240 non-null int64 21 AcceptedCmp4 2240 non-null int64 22 AcceptedCmp5 2240 non-null int64 23 AcceptedCmp1 2240 non-null int64 24 AcceptedCmp2 2240 non-null int64 25 Response 2240 non-null int64 26 Complain 2240 non-null int64 27 Country 2240 non-null object dtypes: int64(23), object(5) memory usage: 490.1+ KB
# Dimensions of the dataframe as (rows, columns).
df.shape
(2240, 28)
# Summary statistics of the quantitative variables, one row per variable.
numeric_summary = df.describe()
numeric_summary.transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ID | 2240.00 | 5592.16 | 3246.66 | 0.00 | 2828.25 | 5458.50 | 8427.75 | 11191.00 |
| Year_Birth | 2240.00 | 1968.81 | 11.98 | 1893.00 | 1959.00 | 1970.00 | 1977.00 | 1996.00 |
| Kidhome | 2240.00 | 0.44 | 0.54 | 0.00 | 0.00 | 0.00 | 1.00 | 2.00 |
| Teenhome | 2240.00 | 0.51 | 0.54 | 0.00 | 0.00 | 0.00 | 1.00 | 2.00 |
| Recency | 2240.00 | 49.11 | 28.96 | 0.00 | 24.00 | 49.00 | 74.00 | 99.00 |
| MntWines | 2240.00 | 303.94 | 336.60 | 0.00 | 23.75 | 173.50 | 504.25 | 1493.00 |
| MntFruits | 2240.00 | 26.30 | 39.77 | 0.00 | 1.00 | 8.00 | 33.00 | 199.00 |
| MntMeatProducts | 2240.00 | 166.95 | 225.72 | 0.00 | 16.00 | 67.00 | 232.00 | 1725.00 |
| MntFishProducts | 2240.00 | 37.53 | 54.63 | 0.00 | 3.00 | 12.00 | 50.00 | 259.00 |
| MntSweetProducts | 2240.00 | 27.06 | 41.28 | 0.00 | 1.00 | 8.00 | 33.00 | 263.00 |
| MntGoldProds | 2240.00 | 44.02 | 52.17 | 0.00 | 9.00 | 24.00 | 56.00 | 362.00 |
| NumDealsPurchases | 2240.00 | 2.33 | 1.93 | 0.00 | 1.00 | 2.00 | 3.00 | 15.00 |
| NumWebPurchases | 2240.00 | 4.08 | 2.78 | 0.00 | 2.00 | 4.00 | 6.00 | 27.00 |
| NumCatalogPurchases | 2240.00 | 2.66 | 2.92 | 0.00 | 0.00 | 2.00 | 4.00 | 28.00 |
| NumStorePurchases | 2240.00 | 5.79 | 3.25 | 0.00 | 3.00 | 5.00 | 8.00 | 13.00 |
| NumWebVisitsMonth | 2240.00 | 5.32 | 2.43 | 0.00 | 3.00 | 6.00 | 7.00 | 20.00 |
| AcceptedCmp3 | 2240.00 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| AcceptedCmp4 | 2240.00 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| AcceptedCmp5 | 2240.00 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| AcceptedCmp1 | 2240.00 | 0.06 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| AcceptedCmp2 | 2240.00 | 0.01 | 0.11 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| Response | 2240.00 | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| Complain | 2240.00 | 0.01 | 0.10 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
# Checking the total product sales.
# The six spend columns are selected by name. The earlier positional slice
# (10:16) was shifted by one: it dropped MntWines and picked up
# NumDealsPurchases, which is a purchase count rather than a spend amount.
product_columns = ['MntWines', 'MntFruits', 'MntMeatProducts',
                   'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
df[product_columns].sum()
MntFruits 58917 MntMeatProducts 373968 MntFishProducts 84057 MntSweetProducts 60621 MntGoldProds 98609 NumDealsPurchases 5208 dtype: int64
# Checking the number of purchases from different sales channels.
# Selected by name. The earlier positional slice (17:21) returned
# NumWebVisitsMonth and AcceptedCmp3 and left out the deals and web purchase
# counts. These are the same four columns the sales-channel share is built
# from further down.
channel_columns = ['NumDealsPurchases', 'NumWebPurchases',
                   'NumCatalogPurchases', 'NumStorePurchases']
df[channel_columns].sum()
NumCatalogPurchases 5963 NumStorePurchases 12970 NumWebVisitsMonth 11909 AcceptedCmp3 163 dtype: int64
# Count, number of distinct values, most frequent value and its frequency
# for every object-typed (categorical/text) column.
df.describe(include=object)
| Education | Marital_Status | Income | Dt_Customer | Country | |
|---|---|---|---|---|---|
| count | 2240 | 2240 | 2216 | 2240 | 2240 |
| unique | 5 | 8 | 1974 | 663 | 8 |
| top | Graduation | Married | $7,500.00 | 8/31/12 | SP |
| freq | 1127 | 864 | 12 | 12 | 1095 |
# Horizontal bar chart of the number of customers per education level.
education_counts = df['Education'].value_counts()
education_counts.plot(kind='barh', title='Education', color='blue', edgecolor=(0,0,0), figsize=(6,4))
<AxesSubplot:title={'center':'Education'}>
# Horizontal bar chart of the number of customers per marital status.
marital_counts = df['Marital_Status'].value_counts()
marital_counts.plot(kind='barh', title='Marital Status', color='green', edgecolor=(0,0,0), figsize=(6,4))
<AxesSubplot:title={'center':'Marital Status'}>
# Number of missing values in each column.
missing_mask = df.isnull()
missing_mask.sum()
ID 0 Year_Birth 0 Education 0 Marital_Status 0 Income 24 Kidhome 0 Teenhome 0 Dt_Customer 0 Recency 0 MntWines 0 MntFruits 0 MntMeatProducts 0 MntFishProducts 0 MntSweetProducts 0 MntGoldProds 0 NumDealsPurchases 0 NumWebPurchases 0 NumCatalogPurchases 0 NumStorePurchases 0 NumWebVisitsMonth 0 AcceptedCmp3 0 AcceptedCmp4 0 AcceptedCmp5 0 AcceptedCmp1 0 AcceptedCmp2 0 Response 0 Complain 0 Country 0 dtype: int64
# Formatting the Income column for the correct data type.
# The header in the CSV is padded with spaces, so normalise it first.
df.rename(columns = {' Income ':'Income'}, inplace = True)
# Income is stored as text such as "$63,943.00". Strip the thousands
# separator and the currency sign, then convert to float.
# regex=False is required: '$' is a regular-expression anchor, and the default
# of `regex` differs between pandas versions, so the literal match must be
# requested explicitly.
df['Income'] = (
    df['Income']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)
# Income distribution: histogram with a kernel density estimate overlaid.
income = df['Income']
sns.displot(income, kde=True)
<seaborn.axisgrid.FacetGrid at 0x2553e1479d0>
# Count histogram of Income on a wider canvas.
plt.figure(figsize=(10,5))
# histplot is the supported replacement for the deprecated
# distplot(..., kde=False). Missing incomes are ignored by the plot.
sns.histplot(data=df, x='Income')
plt.ylabel('Count')
plt.title('Distribution of Income', size=15)
Text(0.5, 1.0, 'Distribution of Income')
# Numerical summary of Income (count, mean, spread, quartiles, extremes).
income_summary = df['Income'].describe()
income_summary
count 2216.00 mean 52247.25 std 25173.08 min 1730.00 25% 35303.00 50% 51381.50 75% 68522.00 max 666666.00 Name: Income, dtype: float64
# Imputing missing values.
# The median is used rather than the mean because the income summary shows an
# extreme maximum (666666) far above the upper quartile.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that form is chained assignment and does not update df
# when pandas copy-on-write is active.
df['Income'] = df['Income'].fillna(df['Income'].median())
# Converting the Dt_Customer column to the pandas datetime format.
# The raw values look like '10/13/12' (month/day/two-digit year). The format
# is given explicitly so month and day cannot be swapped by format inference.
df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"], format='%m/%d/%y')
df["Dt_Customer"]
0 2014-06-16
1 2014-06-15
2 2014-05-13
3 2014-05-11
4 2014-04-08
...
2235 2013-03-07
2236 2013-01-22
2237 2012-12-03
2238 2012-11-29
2239 2012-09-01
Name: Dt_Customer, Length: 2240, dtype: datetime64[ns]
# Re-checking dtypes and non-null counts after the Income and Dt_Customer conversions.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2240 entries, 0 to 2239 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 2240 non-null int64 1 Year_Birth 2240 non-null int64 2 Education 2240 non-null object 3 Marital_Status 2240 non-null object 4 Income 2240 non-null float64 5 Kidhome 2240 non-null int64 6 Teenhome 2240 non-null int64 7 Dt_Customer 2240 non-null datetime64[ns] 8 Recency 2240 non-null int64 9 MntWines 2240 non-null int64 10 MntFruits 2240 non-null int64 11 MntMeatProducts 2240 non-null int64 12 MntFishProducts 2240 non-null int64 13 MntSweetProducts 2240 non-null int64 14 MntGoldProds 2240 non-null int64 15 NumDealsPurchases 2240 non-null int64 16 NumWebPurchases 2240 non-null int64 17 NumCatalogPurchases 2240 non-null int64 18 NumStorePurchases 2240 non-null int64 19 NumWebVisitsMonth 2240 non-null int64 20 AcceptedCmp3 2240 non-null int64 21 AcceptedCmp4 2240 non-null int64 22 AcceptedCmp5 2240 non-null int64 23 AcceptedCmp1 2240 non-null int64 24 AcceptedCmp2 2240 non-null int64 25 Response 2240 non-null int64 26 Complain 2240 non-null int64 27 Country 2240 non-null object dtypes: datetime64[ns](1), float64(1), int64(23), object(3) memory usage: 490.1+ KB
# Inserting a new column for age, obtained by subtracting the year of birth
# from the latest completed year (2021).
REFERENCE_YEAR = 2021
# Guarded so the cell can be re-run: DataFrame.insert raises if the column
# already exists. The column stays at position 2, which later positional
# slices rely on.
if 'age' not in df.columns:
    df.insert(2, 'age', REFERENCE_YEAR - df['Year_Birth'])
# NOTE(review): Year_Birth goes back to 1893 in this data, so a few ages
# exceed 120. These are presumably data-entry errors; confirm before modelling.
df.head()
| ID | Year_Birth | age | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1826 | 1970 | 51 | Graduation | Divorced | 84835.00 | 0 | 0 | 2014-06-16 | 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1 | 4 | 4 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
| 1 | 1 | 1961 | 60 | Graduation | Single | 57091.00 | 0 | 0 | 2014-06-15 | 0 | 464 | 5 | 64 | 7 | 0 | 37 | 1 | 7 | 3 | 7 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | CA |
| 2 | 10476 | 1958 | 63 | Graduation | Married | 67267.00 | 0 | 1 | 2014-05-13 | 0 | 134 | 11 | 59 | 15 | 2 | 30 | 1 | 3 | 2 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US |
| 3 | 1386 | 1967 | 54 | Graduation | Together | 32474.00 | 1 | 1 | 2014-05-11 | 0 | 10 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 4 | 5371 | 1989 | 32 | Graduation | Single | 21474.00 | 1 | 0 | 2014-04-08 | 0 | 6 | 16 | 24 | 11 | 0 | 34 | 2 | 3 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
# Numerical summary of the derived age column.
df['age'].describe()
count 2240.00 mean 52.19 std 11.98 min 25.00 25% 44.00 50% 51.00 75% 62.00 max 128.00 Name: age, dtype: float64
# Creating a new column for the total purchase amount.
# Spend columns are addressed by name rather than by position, so this cell
# no longer depends on how many columns were inserted before it.
product_columns = ['MntWines', 'MntFruits', 'MntMeatProducts',
                   'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
# Guarded so the cell can be re-run (insert raises on an existing column).
# 'total' is placed immediately after the last spend column, the same
# position the later positional slices expect.
if 'total' not in df.columns:
    df.insert(df.columns.get_loc('MntGoldProds') + 1, 'total', 0)
# Adding the spend on the different products to obtain the total purchase amount.
df['total'] = df[product_columns].sum(axis = 1)
# Share of each product category in the overall spend, in percent.
products = df[product_columns + ['total']]
products = products.sum().to_frame()
products = (products / products.loc['total']) * 100
products.columns = ['Percentage']
print(products)
Percentage MntWines 50.17 MntFruits 4.34 MntMeatProducts 27.56 MntFishProducts 6.19 MntSweetProducts 4.47 MntGoldProds 7.27 total 100.00
# Pie chart of each product category's share of the total amount spent.
# The 'total' row is removed first so that only the six categories are drawn.
products = products.drop(index=['total'])
labels = ['Wines', 'Fruits', 'Meat Products', ' Fish Products', 'Sweet Products', 'Gold']
colors = sns.color_palette('pastel')[:6]
plt.pie(products['Percentage'], autopct='%.0f%%', colors=colors, labels=labels)
plt.show()
# Creating a new column for the total number of purchases.
# Purchase-count columns are addressed by name rather than by position, so
# this cell no longer depends on the columns inserted before it.
channel_columns = ['NumDealsPurchases', 'NumWebPurchases',
                   'NumCatalogPurchases', 'NumStorePurchases']
# Guarded so the cell can be re-run (insert raises on an existing column).
# 'total_purchase' is placed immediately after NumStorePurchases, the same
# position the later positional slices expect.
if 'total_purchase' not in df.columns:
    df.insert(df.columns.get_loc('NumStorePurchases') + 1, 'total_purchase', 0)
# Calculating the total number of purchases.
df['total_purchase'] = df[channel_columns].sum(axis = 1)
# Share of each sales channel in the overall number of purchases, in percent.
sales_channel = df[channel_columns + ['total_purchase']]
sales_channel = sales_channel.sum().to_frame()
sales_channel = (sales_channel / sales_channel.loc['total_purchase']) * 100
sales_channel.columns = ['Percentage']
print(sales_channel)
Percentage NumDealsPurchases 15.64 NumWebPurchases 27.48 NumCatalogPurchases 17.91 NumStorePurchases 38.96 total_purchase 100.00
# Pie chart of each sales channel's share of the total number of purchases.
# The 'total_purchase' row is removed first so that only the four channels are drawn.
sales_channel = sales_channel.drop(index=['total_purchase'])
labels = ['Deals', 'Web', 'Catalog', 'Stores']
colors = sns.color_palette('pastel')[:4]
plt.pie(sales_channel['Percentage'], autopct='%.0f%%', colors=colors, labels=labels)
plt.show()
# Creating a subsection of the dataframe to analyze the marketing campaigns.
# The campaign flags are picked by name, in the order they appear in the
# dataframe. The previous positional slice (23:29) only pointed at these
# columns because of the two columns inserted earlier.
campaign_columns = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
                    'AcceptedCmp1', 'AcceptedCmp2', 'Response']
campaigns = df[campaign_columns]
# Each flag is 0/1, so the column sum is the number of customers who accepted.
campaigns = campaigns.sum().to_frame()
campaigns.columns = ['Frequency']
campaigns
| Frequency | |
|---|---|
| AcceptedCmp3 | 163 |
| AcceptedCmp4 | 167 |
| AcceptedCmp5 | 163 |
| AcceptedCmp1 | 144 |
| AcceptedCmp2 | 30 |
| Response | 334 |
# Bar chart of how many customers accepted each campaign.
fig = plt.figure(figsize=(10, 5))
labels = list(campaigns.index)
plt.bar(x=labels, height=campaigns['Frequency'], width=0.4, color='blue')
plt.title("Campaigns Accepted")
plt.xlabel("Campaigns")
plt.ylabel("Frequency of Accepted Campaigns")
plt.show()
# Acceptance rate per campaign: the mean of a 0/1 flag, times 100, is the
# percentage of customers who accepted.
rate_columns = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']
acceptance_rate = df[rate_columns].mean() * 100
campaigns2 = pd.DataFrame(acceptance_rate, columns=['Percent']).reset_index()
# Bars ordered from the least to the most accepted campaign.
px.bar(
    campaigns2.sort_values('Percent'),
    x='Percent',
    y='index',
    labels={'index':'Campaign','Percent':'Accepted (%)'},
    title='Campaign Acceptance Rate',
)
# Subset holding only the six product spend columns, and its column totals.
spend_columns = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
products2 = df[spend_columns]
products2.sum()
MntWines 680816 MntFruits 58917 MntMeatProducts 373968 MntFishProducts 84057 MntSweetProducts 60621 MntGoldProds 98609 dtype: int64
# Bar chart of the total amount spent per product category.
# A six-colour palette is built here for the six bars. The global `colors`
# was last set to four colours for the sales-channel pie, which made the
# last two bars reuse the first two colours.
products2.sum().plot(kind='bar',figsize=(6,5),edgecolor=(0,0,0), color=sns.color_palette('pastel')[0:6], rot=45)
<AxesSubplot:>
# Mean amount spent per customer for every column whose name contains 'Mnt',
# rounded to whole units.
mnt_columns = [name for name in df.columns if 'Mnt' in name]
mean_spend = df[mnt_columns].mean().round()
average_spend = pd.DataFrame(mean_spend, columns=['Average_Spend']).reset_index()
# Bars ordered from the lowest to the highest average spend.
px.bar(
    average_spend.sort_values('Average_Spend'),
    x='Average_Spend',
    y='index',
    labels={'index': 'Product',"Average_Spend":'Amount Spent'},
    title='Average spending on products',
)
# Grouping the total acceptance of the last campaign by the age of the customer.
lastcampaign = df.groupby('age')['Response'].sum()
# Plotting it on a scatter plot: one point per age, y = number of acceptances.
# Both axes come from the same grouped table. The earlier call passed
# df['age'] (one value per customer) against the grouped sums (one value per
# age): two series of different length and index, so they could not be paired
# row by row. It also relied on positional x/y arguments.
sns.relplot(data=lastcampaign.reset_index(), x='age', y='Response')
<seaborn.axisgrid.FacetGrid at 0x255400d0fd0>
# Turn the grouped series into a two-column frame (age, Response) and compute
# the correlation between age and the number of acceptances at that age.
lastcampaign = lastcampaign.reset_index()
ages, accepted = lastcampaign['age'], lastcampaign['Response']
correlation = ages.corr(accepted)
correlation
-0.28636792601520616
# Number of accepted responses per country, kept as a one-column dataframe
# indexed by Country.
responses_by_country = df.groupby('Country')['Response'].sum()
q11 = responses_by_country.to_frame()
q11
| Response | |
|---|---|
| Country | |
| AUS | 23 |
| CA | 38 |
| GER | 17 |
| IND | 13 |
| ME | 2 |
| SA | 52 |
| SP | 176 |
| US | 13 |
# Bar chart of the per-country response totals computed above.
countries = q11.index
responses = q11['Response']
sns.barplot(x=countries, y=responses)
<AxesSubplot:xlabel='Country', ylabel='Response'>
# Creating a subsection of the columns we require for this analysis.
# Dependents = children + teenagers in the household. assign() builds a new
# frame, so no column is written into a slice of df (the previous version
# assigned to a slice, which pandas flags with SettingWithCopyWarning).
# Resulting columns: total, total_purchase, Dependents.
dependents = df[['total', 'total_purchase']].assign(
    Dependents=df['Kidhome'] + df['Teenhome']
)
# Total amount spent, summed over all customers with the same number of dependents.
d1 = dependents.groupby('Dependents', as_index=False)['total'].sum()
d1
| Dependents | total | |
|---|---|---|
| 0 | 0 | 705647 |
| 1 | 1 | 533243 |
| 2 | 2 | 103544 |
| 3 | 3 | 14554 |
# Bar chart of the summed spend for each number of dependents.
plt.figure(figsize=(12, 4))
sns.barplot(x=d1['Dependents'], y=d1['total'], palette='Blues')
plt.title('Number of Dependents Vs Total Amount Spent')
Text(0.5, 1.0, 'Number of Dependents Vs Total Amount Spent')
# Total number of purchases, summed over all customers with the same number of dependents.
d2 = dependents.groupby('Dependents', as_index=False)['total_purchase'].sum()
d2
| Dependents | total_purchase | |
|---|---|---|
| 0 | 0 | 11201 |
| 1 | 1 | 16334 |
| 2 | 2 | 5137 |
| 3 | 3 | 619 |
# Bar chart of the summed purchase count for each number of dependents.
plt.figure(figsize=(12, 4))
sns.barplot(x=d2['Dependents'], y=d2['total_purchase'], palette='Blues')
plt.title('Number of Dependents Vs Total Number of Purchases')
Text(0.5, 1.0, 'Number of Dependents Vs Total Number of Purchases')
# Variables entering the correlation analysis (identifiers, dates and
# categorical columns are left out).
corr_columns = [
    'age', 'Income', 'Kidhome', 'Teenhome', 'Recency',
    'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds', 'total',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'total_purchase', 'NumWebVisitsMonth',
    'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
    'AcceptedCmp2', 'Response', 'Complain',
]
# Pairwise correlation matrix, drawn on a fixed -1..1 colour scale.
corr1 = df[corr_columns].corr()
sns.heatmap(corr1, vmin=-1, vmax=1, cmap='plasma')
plt.show()
# Flatten the correlation matrix into one row per (variable, variable) pair,
# ordered from the lowest to the highest coefficient. Every pair shows up
# twice, once in each order.
corr2 = corr1.unstack().sort_values().to_frame().reset_index()
# Checking values of high negative correlation
strongly_negative = corr2[0] < -0.50
corr2[strongly_negative]
| level_0 | level_1 | 0 | |
|---|---|---|---|
| 0 | Kidhome | total | -0.56 |
| 1 | total | Kidhome | -0.56 |
| 2 | NumWebVisitsMonth | Income | -0.55 |
| 3 | Income | NumWebVisitsMonth | -0.55 |
| 4 | NumWebVisitsMonth | MntMeatProducts | -0.54 |
| 5 | MntMeatProducts | NumWebVisitsMonth | -0.54 |
| 6 | NumWebVisitsMonth | NumCatalogPurchases | -0.52 |
| 7 | NumCatalogPurchases | NumWebVisitsMonth | -0.52 |
| 8 | Kidhome | NumCatalogPurchases | -0.50 |
| 9 | NumCatalogPurchases | Kidhome | -0.50 |
| 10 | total | NumWebVisitsMonth | -0.50 |
| 11 | NumWebVisitsMonth | total | -0.50 |
# Checking values of high positive correlation.
# The upper bound of 0.99 keeps each variable's correlation with itself (1.0)
# out of the listing.
strongly_positive = (corr2[0] >= 0.60) & (corr2[0] <= 0.99)
corr2[strongly_positive]
| level_0 | level_1 | 0 | |
|---|---|---|---|
| 568 | total | MntSweetProducts | 0.60 |
| 569 | MntSweetProducts | total | 0.60 |
| 570 | MntFruits | total | 0.61 |
| 571 | total | MntFruits | 0.61 |
| 572 | MntWines | NumCatalogPurchases | 0.64 |
| 573 | NumCatalogPurchases | MntWines | 0.64 |
| 574 | NumStorePurchases | MntWines | 0.64 |
| 575 | MntWines | NumStorePurchases | 0.64 |
| 576 | MntFishProducts | total | 0.64 |
| 577 | total | MntFishProducts | 0.64 |
| 578 | Income | total | 0.66 |
| 579 | total | Income | 0.66 |
| 580 | total | NumStorePurchases | 0.67 |
| 581 | NumStorePurchases | total | 0.67 |
| 582 | total_purchase | MntWines | 0.71 |
| 583 | MntWines | total_purchase | 0.71 |
| 584 | NumCatalogPurchases | MntMeatProducts | 0.72 |
| 585 | MntMeatProducts | NumCatalogPurchases | 0.72 |
| 586 | total_purchase | NumCatalogPurchases | 0.74 |
| 587 | NumCatalogPurchases | total_purchase | 0.74 |
| 588 | total | total_purchase | 0.75 |
| 589 | total_purchase | total | 0.75 |
| 590 | total_purchase | NumWebPurchases | 0.78 |
| 591 | NumWebPurchases | total_purchase | 0.78 |
| 592 | NumCatalogPurchases | total | 0.78 |
| 593 | total | NumCatalogPurchases | 0.78 |
| 594 | NumStorePurchases | total_purchase | 0.82 |
| 595 | total_purchase | NumStorePurchases | 0.82 |
| 596 | total | MntMeatProducts | 0.84 |
| 597 | MntMeatProducts | total | 0.84 |
| 598 | total | MntWines | 0.89 |
| 599 | MntWines | total | 0.89 |
# Total number of complaints recorded for each education level.
ed_v_comp = df.groupby('Education', as_index=False)['Complain'].sum()
ed_v_comp
| Education | Complain | |
|---|---|---|
| 0 | 2n Cycle | 4 |
| 1 | Basic | 0 |
| 2 | Graduation | 14 |
| 3 | Master | 2 |
| 4 | PhD | 1 |
# Pairwise scatter plots of the spend columns, coloured by whether the
# customer accepted the last campaign.
sns.set(style="ticks", color_codes=True)
response_pair_columns = ['MntWines','MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds','total', 'Response']
sns.pairplot(df[response_pair_columns], hue='Response')
<seaborn.axisgrid.PairGrid at 0x255407e75b0>
# Pairwise scatter plots of the spend columns, coloured by education level.
education_pair_columns = ['MntWines','MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds','total', 'Education']
sns.pairplot(df[education_pair_columns], hue='Education')
<seaborn.axisgrid.PairGrid at 0x2554080cf70>
# Pairwise scatter plots of the spend columns, coloured by marital status.
marital_pair_columns = ['MntWines','MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds','total', 'Marital_Status']
sns.pairplot(df[marital_pair_columns], hue='Marital_Status')
<seaborn.axisgrid.PairGrid at 0x255472466a0>